{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "07f56cab-4159-49e1-a865-0db9d78ce1d8",
   "metadata": {},
   "source": [
    "Download the [ChatGPT knowledge-graph triplet formatted datasets](https://huggingface.co/datasets/mesolitica/chatgpt-kg-triplets) from Hugging Face first if you haven't already. Uncomment and run the cell below to download the astroawani dataset and save it to the current working directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "4adfe897",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!wget https://huggingface.co/datasets/mesolitica/chatgpt-kg-triplets/resolve/main/kg-astroawani.translated.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "ac8cd060",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "0b7edc90",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rebel_format(triplets):\n",
    "    \"\"\"\n",
    "    Linearise a list of [head, relation, tail] triples into one rebel-format string.\n",
    "\n",
    "    Every triple is written as `<subj> tail <obj> relation`. A `<triplet> head`\n",
    "    prefix is written in front of it unless the triple directly before it has an\n",
    "    equal head, so consecutive triples with the same head share a single prefix.\n",
    "    All three fields are split on whitespace and re-joined with single spaces.\n",
    "\n",
    "    Example input:\n",
    "    [['Bruno Santana', 'participant of', '2004 Summer Olympics'],\n",
    "    ['Bruno Santana', 'participant of', '2008 Summer Olympics'],\n",
    "    ['Bruno Santana', 'country of citizenship', 'Brazil']]\n",
    "    Example output:\n",
    "    <triplet> Bruno Santana <subj> 2004 Summer Olympics <obj> participant of <subj> 2008 Summer Olympics <obj> participant of <subj> Brazil <obj> country of citizenship\n",
    "    \"\"\"\n",
    "    pieces = []\n",
    "    for idx, current in enumerate(triplets):\n",
    "        head, relation, tail = current[0], current[1], current[2]\n",
    "        # Only the immediately preceding triple is consulted, not every earlier one.\n",
    "        shares_head = idx > 0 and head == triplets[idx - 1][0]\n",
    "        if not shares_head:\n",
    "            pieces.append('<triplet>')\n",
    "            pieces.extend(head.split())\n",
    "        pieces.append('<subj>')\n",
    "        pieces.extend(tail.split())\n",
    "        pieces.append('<obj>')\n",
    "        pieces.extend(relation.split())\n",
    "    return ' '.join(pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "9a593efa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read only the first JSON line so a single record can be inspected below.\n",
    "# The encoding is given explicitly so decoding does not depend on the platform's\n",
    "# locale default (JSON text is expected to be UTF-8).\n",
    "with open('kg-astroawani.translated.jsonl', encoding='utf-8') as fopen:\n",
    "    for line in fopen:\n",
    "        # `l` is the parsed first record; the inspection cells below read it.\n",
    "        l = json.loads(line)\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a592a54e-ed0a-439b-a475-5cd1572328d8",
   "metadata": {},
   "source": [
    "Let's inspect the data before we convert it into the rebel format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "527fb15b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['id', 'title', 'description', 'body', 'title_kg', 'description_kg', 'body_kg', 'title_kg_ms', 'description_kg_ms', 'body_kg_ms'])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "85dd6b6e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'head': 'Padah', 'type': 'mempunyai', 'tail': 'hubungan sulit'},\n",
       " {'head': 'hubungan sulit', 'type': 'dengan', 'tail': 'pekerja sendiri'},\n",
       " {'head': 'Padah', 'type': 'dipecat', 'tail': \"CEO McDonald's\"}]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l['title_kg_ms']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "5575e635",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Mahkamah Rayuan Singapura pada Selasa menangguhkan keputusannya terhadap rayuan warga Malaysia Nagaentran K. Dharmalingam terhadap keputusan Mahkamah Tinggi.',\n",
       " [{'head': 'Mahkamah Rayuan Singapura',\n",
       "   'type': 'menangguhkan',\n",
       "   'tail': 'keputusannya'},\n",
       "  {'head': 'rayuan',\n",
       "   'type': 'penduduk',\n",
       "   'tail': 'Malaysia Nagaentran K. Dharmalingam'},\n",
       "  {'head': 'rayuan', 'type': 'terhadap', 'tail': 'keputusan Mahkamah Tinggi'}]]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Use `l`, the first record loaded above: `data` is only assigned by the conversion\n",
    "# cell further down, so referring to it here fails on a fresh kernel.\n",
    "# Each entry of 'body_kg_ms' is a [text, triples] pair.\n",
    "l['body_kg_ms'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "b87b4aaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _clean_triples(kg):\n",
    "    \"\"\"\n",
    "    Return [head, type, tail] lists for the usable triples in `kg`.\n",
    "\n",
    "    `kg` is a list of dicts with 'head', 'type' and 'tail' keys (or None / empty).\n",
    "    A triple is dropped when any of its values is None, or when head, type or tail\n",
    "    is not a non-blank string. A blank head would otherwise be written out as\n",
    "    '<triplet> <subj> ...', i.e. a triplet with no head entity.\n",
    "    \"\"\"\n",
    "    triples = []\n",
    "    for t in kg or []:\n",
    "        if any(value is None for value in t.values()):\n",
    "            continue\n",
    "        fields = [t.get('head'), t.get('type'), t.get('tail')]\n",
    "        if all(isinstance(field, str) and field.strip() for field in fields):\n",
    "            triples.append(fields)\n",
    "    return triples\n",
    "\n",
    "\n",
    "def _write_example(fout, text, kg):\n",
    "    \"\"\"Write one {'text', 'kg'} JSON line to `fout`; nothing is written if no triple in `kg` is usable.\"\"\"\n",
    "    triples = _clean_triples(kg)\n",
    "    if triples:\n",
    "        d = {\n",
    "            'text': text.strip(),\n",
    "            'kg': rebel_format(triples)\n",
    "        }\n",
    "        fout.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "\n",
    "# Every article yields up to one example for its title, one for its description and\n",
    "# one per body entry. Leaving the `with` block flushes and closes the output file.\n",
    "with open('processed-kg-astroawani.jsonl', 'w', encoding='utf-8') as fopen_l:\n",
    "    with open('kg-astroawani.translated.jsonl', encoding='utf-8') as fopen:\n",
    "        # The loop variable is `line`, not `l`, so the record inspected earlier is kept.\n",
    "        for line in fopen:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            data = json.loads(line)\n",
    "\n",
    "            _write_example(fopen_l, data.get('title'), data.get('title_kg_ms'))\n",
    "            _write_example(fopen_l, data.get('description'), data.get('description_kg_ms'))\n",
    "\n",
    "            # Each body entry is a [text, triples] pair.\n",
    "            for body in data.get('body_kg_ms') or []:\n",
    "                _write_example(fopen_l, body[0], body[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "bae96507",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "115784 processed-kg-astroawani.jsonl\n"
     ]
    }
   ],
   "source": [
    "# Count the number of lines in the converted dataset\n",
    "!wc -l processed-kg-astroawani.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "1508d90a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"text\": \"Padah jalin hubungan sulit dengan pekerja sendiri, CEO McDonald's dipecat serta merta\", \"kg\": \"<triplet> Padah <subj> hubungan sulit <obj> mempunyai <triplet> hubungan sulit <subj> pekerja sendiri <obj> dengan <triplet> Padah <subj> CEO McDonald's <obj> dipecat\"}\n",
      "{\"text\": \"CEO tidak boleh menjalin hubungan dengan mana-mana kakitangan.\", \"kg\": \"<triplet> CEO <subj> kakitangan <obj> tidak boleh menjalin hubungan dengan\"}\n",
      "{\"text\": \"SYARIKAT rantaian makanan segera terkemuka dunia, McDonald's Corp mengesahkan telah memecat Ketua Pegawai Eksekutif (CEO), Steve Easterbrook selepas menjalinkan hubungan sulit dengan salah seorang kakitangannya.\", \"kg\": \"<triplet> <subj> yang telah memecat Steve Easterbrook <obj> mengesahkan <triplet> Steve Easterbrook <subj> hubungan yang tidak sesuai dengan pekerja <obj> telah\"}\n"
     ]
    }
   ],
   "source": [
    "!head -n 3 processed-kg-astroawani.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "3b1ef0ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"text\": \"Padah jalin hubungan sulit dengan pekerja sendiri, CEO McDonald's dipecat serta merta\", \"kg\": \"<triplet> Padah <subj> hubungan sulit <obj> mempunyai <triplet> hubungan sulit <subj> pekerja sendiri <obj> dengan <triplet> Padah <subj> CEO McDonald's <obj> dipecat\"}\n",
      "{\"text\": \"CEO tidak boleh menjalin hubungan dengan mana-mana kakitangan.\", \"kg\": \"<triplet> CEO <subj> kakitangan <obj> tidak boleh menjalin hubungan dengan\"}\n",
      "{\"text\": \"SYARIKAT rantaian makanan segera terkemuka dunia, McDonald's Corp mengesahkan telah memecat Ketua Pegawai Eksekutif (CEO), Steve Easterbrook selepas menjalinkan hubungan sulit dengan salah seorang kakitangannya.\", \"kg\": \"<triplet> <subj> yang telah memecat Steve Easterbrook <obj> mengesahkan <triplet> Steve Easterbrook <subj> hubungan yang tidak sesuai dengan pekerja <obj> telah\"}\n"
     ]
    }
   ],
   "source": [
    "!head -n 3 processed-kg-astroawani.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c07349d-9dbe-424a-a537-e3eab0fc173d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
